## [1] "/home/guanshim/Documents/gitlab/Omics_Integration/DataRaw/hiv_infected_un"

1 Data and Functions

# source functions source to get the
# load_filtered_micro_level function to get clr of RA
source(paste0(dir, "Code/5_29_Generate_filtered_Data_Microbiome.R"))
# clean and transform transcriptome data, subset of genes
source(paste0(dir, "Code/6_5_clean_transcriptome.R"))
# clinical data
source(paste0(dir, "Code/6_5_clean_clinical.R"))
# diagnostic plots and tables
source(paste0(dir, "Code/ref_plots.R"))

########## Datasets ############# phenotype contains ID
clin <- rescaled_cli()
## [1] "Regularized Log Transformation will be applied!"
## [1] "Check Samples, Match: "
## [1] TRUE
## [1] "No gene list provided, will use the whole Transcriptome"
# import gene lists #
isgs <- as.data.frame(read.delim(paste0(dir, "DataRaw/hiv_infected_un/coreISG")))
genesbeta <- as.data.frame(read.delim(paste0(dir, "DataRaw/hiv_infected_un/genesbeta")))
# LPS Num 16 subjest is missing
which(is.na(clin$LPS))
## [1] 16
#### Transcriptome get gene symbols
rna_isgs <- rescaled_rna(isgs, rlog = T)
## [1] "Regularized Log Transformation will be applied!"
## [1] "Check Samples, Match: "
## [1] TRUE
## [1] "Use a subset of genes"
isgs_symbol = rna_isgs[[2]] %>% dplyr::select(Symbol)
# rlog transform or not
isgs_rlog <- rna_isgs[[1]]

# ifn-beta
genesbeta <- as.data.frame(read.delim(paste0(dir, "DataRaw/hiv_infected_un/genesbeta")))
rna_genesbeta <- rescaled_rna(genesbeta, rlog = T)
## [1] "Regularized Log Transformation will be applied!"
## [1] "Check Samples, Match: "
## [1] TRUE
## [1] "Use a subset of genes"
# rlog transform or not
genesbeta_rlog <- rna_genesbeta[[1]]
# names
colnames(genesbeta_rlog) <- rna_genesbeta[[2]]$Symbol

#### Microbiome
gen_30_10 <- load_filtered_micro_level_samples("genus", prevalence = 30, 
    RA = 10, wd = "Ubuntu")
## Found "Unclassified" category in input data
## Created new "Other" category.
## Converted 35400 counts to "Other" otu category.
## Remaining OTUS: 270  (Including "Other").
## 
## Prevalence cutoff: 30% (i.e., at least 30% of libaries must be represented to keep OTU)
## Found 'Other' category in input data.
## Collapsed 183 OTUs to 'Other' in OTU table.
## Converted 48322 counts to 'Other' in OTU table.
## Remaining OTUs: 87  (Including 'Other').
## 
## Relative abundance cutoff: 10 % (i.e., at least one library must have RA > 10 % to keep OTU).
## Found "Other" category in input data.
## Collapsed 68 OTUs to "Other" otu category.
## Converted 594402 counts to "Other" otu category.
## Remaining OTUS: 19  (Including "Other").
## 
## Contains 27 subjects/libraries from Explicet OTU file.
micro_clr <- gen_30_10[[2]] %>% as.data.frame()
# rescale to mean 0 and variance 1
mibi <- rescale_microbiome(micro_clr)
# 
anyNA(mibi)
## [1] FALSE
anyNA(isgs_rlog)
## [1] FALSE

2 Density Plots: Hints for weighted SmCCNet

For LPS, the measurement of subject 16 is missing.
Skewness:
> Although SmCCNet does not require normality, it calculates the Pearson correlation between linear combinations of omics features and the phenotype, which assumes finite variances and finite covariance. It is necessary to include a transformation if the data are skewed.

# names
colnames(isgs_rlog) <- isgs_symbol$Symbol
# where na in LPS
n_na = which(is.na(clin$LPS))
# get hints ideas the final clinical parameter to use
LPS = clin %>% select(LPS) %>% na.omit()
df1 = get_corr(mibi[-n_na, ], isgs_rlog[-n_na, ], "Microbiome:Core ISGs")
## [1] "No missing values"
## [1] "Group is the group label"
# stats::cor(clin$LPS, mibi, use = 'pairwise.complete.obs',
# method = 'pearson')
df2 = get_corr(LPS, mibi[-n_na, ], "LPS:Microbiome")
## [1] "No missing values"
## [1] "Group is the group label"
df3 = get_corr(LPS, isgs_rlog[-n_na, ], "LPS:Core ISGs")
## [1] "No missing values"
## [1] "Group is the group label"
# plots
data = rbind(df1, df2, df3)

box_values_group(data, "Pearson Correlations")
## [1] "A column called group and a column called values"

density_values_group(data, "Pearson Correlations")
## [1] "A column called group and a column called values"

# starting points of weights
print("From the boxplot and the Density Plot: Weights 10, 5, 1 for X1X2, LPS:Microbiome, LPS:Core ISGs")
## [1] "From the boxplot and the Density Plot: Weights 10, 5, 1 for X1X2, LPS:Microbiome, LPS:Core ISGs"
# check skewness
check_skew(micro_clr, "Density Plot of CLR of Microbiome RA")

check_skew(rescaled_rna(isgs, rlog = F)[[3]], "Density Plot of without rlog Transcriptome")
## [1] "Regularized Log Transformation will Not be applied!"
## [1] "Check Samples, Match: "
## [1] TRUE
## [1] "Use a subset of genes"

check_skew(rna_isgs[[3]], "Density Plot of rlog Transcriptome")

3 SmCCNet Parameters Engineering

If EdgeCut = 0 (default), then the full module network will be created.

########## parameters ################ by convention, the first Omics
########## is transcriptome and the 2nd is the microbiome this sample
########## size, 25, the paper use 4-fold missing
n_na = which(is.na(clin$LPS))
# check ID order again
sum(rownames(mibi) != rownames(isgs_rlog))
## [1] 0
sum(rownames(isgs_rlog) != clin$ID)
## [1] 0
# unchanged parameters
p1 <- ncol(isgs_rlog)
p2 <- ncol(mibi)
n <- nrow(isgs_rlog[-n_na, ])
AbarLabel <- c(colnames(cbind(isgs_rlog, mibi)))

# # parameters K <- 4 # Number of folds in K-fold CV.  CCcoef
# <- NULL # Unweighted version of SmCCNet.  # Microbiome is
# the 2nd one, thus s2 is larger s1 <- 0.7; s2 <- 0.9 #
# Feature sampling proportions.  SubsamplingNum <- 1000 #
# Number of subsamples.

# 4 fold Cross Validation error
t12 <- read.csv(file = paste0("~/Documents/gitlab/Omics_Integration/DataProcessed/YCore_ISGs_Genus4foldCV/", 
    "TotalPredictionError.csv"))

t12 %>% plyr::arrange(CC.Pred..Error) %>% kable(., caption = "aggregated pseudo canonical correlations and Total Prediction Error")
aggregated pseudo canonical correlations and Total Prediction Error
X l1 l2 Test.CC CC.Pred..Error
18 0.30 0.15 0.6615175 1.197110
24 0.30 0.20 0.6535900 1.203428
6 0.30 0.05 0.6544650 1.211666
12 0.30 0.10 0.6345675 1.231327
26 0.10 0.25 0.7544275 1.276881
2 0.10 0.05 0.7383200 1.292255
8 0.10 0.10 0.7140625 1.308131
14 0.10 0.15 0.7102400 1.310073
36 0.30 0.30 0.5936050 1.312752
20 0.10 0.20 0.7271975 1.319107
5 0.25 0.05 0.5990475 1.341556
23 0.25 0.20 0.5920275 1.348692
11 0.25 0.10 0.5930750 1.351956
17 0.25 0.15 0.5687125 1.379991
30 0.30 0.25 0.4756575 1.422896
35 0.25 0.30 0.5081900 1.435474
29 0.25 0.25 0.4610125 1.517472
21 0.15 0.20 0.5839600 1.541451
3 0.15 0.05 0.5694000 1.576122
9 0.15 0.10 0.5967500 1.580985
27 0.15 0.25 0.4368625 1.581077
15 0.15 0.15 0.5983500 1.621710
10 0.20 0.10 0.3827475 1.622086
16 0.20 0.15 0.3687825 1.635169
4 0.20 0.05 0.3599625 1.661337
22 0.20 0.20 0.3465925 1.666163
28 0.20 0.25 0.4367775 1.749837
32 0.10 0.30 0.2792200 1.766133
31 0.05 0.30 0.1967400 1.817246
34 0.20 0.30 0.6120025 1.820007
33 0.15 0.30 0.6036350 1.947448
19 0.05 0.20 0.4243250 2.016814
1 0.05 0.05 0.3973025 2.017694
13 0.05 0.15 0.4211100 2.025688
7 0.05 0.10 0.4243600 2.040176
25 0.05 0.25 0.3935450 2.069365

3.1 Run SmCCNet

The trimmed module (edge cut = 0.1) is shown below. If a full module does not contain any edge that passes the cut threshold, a message “No edge passes threshold” will be produced. To see all complete module, set edgeCut = 0.
The Lambda1 and Lambda2 are the most important parameters.

# correlation
x <- cbind(isgs_rlog[-n_na, ], mibi[-n_na, ])
corr <- cor(x, method = "pearson")
# Make LPS a vector
LPS = clin %>% select(LPS) %>% na.omit() %>% as.matrix() %>% 
    as.vector()

############# Unweighted ####################
set.seed(123)
W1 <- getRobustPseudoWeights(isgs_rlog[-n_na, ], mibi[-n_na, 
    ], Trait = LPS, Lambda1 = 0.3, Lambda2 = 0.15, s1 = 0.7, 
    s2 = 0.9, NoTrait = FALSE, FilterByTrait = FALSE, SubsamplingNum = 1000, 
    CCcoef = NULL, trace = FALSE)
dim(W1)
## [1]  265 1000
# robust similarity matrix
abar <- getAbar(W1, P1 = p1, FeatureLabel = AbarLabel)
modules <- getMultiOmicsModules(abar, P1 = p1, PlotTree = T, 
    CutHeight = 1 - 0.1^10)

for (idx in 1:length(modules)) {
    plotMultiOmicsNetwork(abar, corr, modules, ModuleIdx = idx, 
        P1 = p1, FeatureLabel = AbarLabel, EdgeCut = 0.1)
}
## [1] "No edge passes threshold."

## [1] "No edge passes threshold."
## [1] "No edge passes threshold."
############## weighted ##################
set.seed(123)
W1 <- getRobustPseudoWeights(isgs_rlog[-n_na, ], mibi[-n_na, 
    ], Trait = LPS, Lambda1 = 0.3, Lambda2 = 0.15, s1 = 0.7, 
    s2 = 0.9, NoTrait = FALSE, FilterByTrait = FALSE, SubsamplingNum = 1000, 
    CCcoef = c(10, 5, 1), trace = FALSE)
dim(W1)
## [1]  265 1000
abar <- getAbar(W1, P1 = p1, FeatureLabel = AbarLabel)
modules <- getMultiOmicsModules(abar, P1 = p1)

for (idx in 1:length(modules)) {
    plotMultiOmicsNetwork(abar, corr, modules, ModuleIdx = idx, 
        P1 = p1, FeatureLabel = AbarLabel, EdgeCut = 0.1)
}
## [1] "No edge passes threshold."

## [1] "No edge passes threshold."
## [1] "No edge passes threshold."

3.1.1 Tuning Results

################ core ISGs ################ correlation
x <- cbind(isgs_rlog[-n_na, ], mibi[-n_na, ])
corr <- cor(x, method = "pearson")
p1 <- ncol(isgs_rlog)
p2 <- ncol(mibi)
n <- nrow(isgs_rlog[-n_na, ])
AbarLabel <- c(colnames(cbind(isgs_rlog, mibi)))
#### 
set.seed(123)
W1 <- getRobustPseudoWeights(isgs_rlog[-n_na, ], mibi[-n_na, 
    ], Trait = LPS, Lambda1 = 0.35, Lambda2 = 0.1, s1 = 0.8, 
    s2 = 0.9, NoTrait = FALSE, FilterByTrait = FALSE, SubsamplingNum = 1000, 
    CCcoef = NULL, trace = FALSE)
dim(W1)
## [1]  265 1000
# robust similarity matrix
abar <- getAbar(W1, P1 = p1, FeatureLabel = AbarLabel)
modules <- getMultiOmicsModules(abar, P1 = p1, PlotTree = T, 
    CutHeight = 1 - 0.1^10)

for (idx in 1:length(modules)) {
    plotMultiOmicsNetwork(abar, corr, modules, ModuleIdx = idx, 
        P1 = p1, FeatureLabel = AbarLabel, EdgeCut = 0)
}

### weighted ####
set.seed(123)
W1 <- getRobustPseudoWeights(isgs_rlog[-n_na, ], mibi[-n_na, 
    ], Trait = LPS, Lambda1 = 0.3, Lambda2 = 0.05, s1 = 0.8, 
    s2 = 0.9, NoTrait = FALSE, FilterByTrait = FALSE, SubsamplingNum = 1000, 
    CCcoef = c(10, 1, 10), trace = FALSE)
dim(W1)
## [1]  265 1000
abar <- getAbar(W1, P1 = p1, FeatureLabel = AbarLabel)
modules <- getMultiOmicsModules(abar, P1 = p1)

for (idx in 1:length(modules)) {
    plotMultiOmicsNetwork(abar, corr, modules, ModuleIdx = idx, 
        P1 = p1, FeatureLabel = AbarLabel, EdgeCut = 0)
}

################## Beta ISGs ############## correlation
x <- cbind(genesbeta_rlog[-n_na, ], mibi[-n_na, ])
corr <- cor(x, method = "pearson")
p1 <- ncol(genesbeta_rlog)
p2 <- ncol(mibi)
n <- nrow(genesbeta_rlog[-n_na, ])
AbarLabel <- c(colnames(cbind(genesbeta_rlog, mibi)))
####### 
set.seed(123)
W1 <- getRobustPseudoWeights(genesbeta_rlog[-n_na, ], mibi[-n_na, 
    ], Trait = LPS, Lambda1 = 0.2, Lambda2 = 0.05, s1 = 0.8, 
    s2 = 0.9, NoTrait = FALSE, FilterByTrait = FALSE, SubsamplingNum = 1000, 
    CCcoef = NULL, trace = FALSE)
dim(W1)
## [1]  425 1000
# robust similarity matrix
abar <- getAbar(W1, P1 = p1, FeatureLabel = AbarLabel)
modules <- getMultiOmicsModules(abar, P1 = p1, PlotTree = T, 
    CutHeight = 1 - 0.1^10)

for (idx in 1:length(modules)) {
    plotMultiOmicsNetwork(abar, corr, modules, ModuleIdx = idx, 
        P1 = p1, FeatureLabel = AbarLabel, EdgeCut = 0)
}

#### weighted ###
set.seed(123)
W1 <- getRobustPseudoWeights(genesbeta_rlog[-n_na, ], mibi[-n_na, 
    ], Trait = LPS, Lambda1 = 0.1, Lambda2 = 0.05, s1 = 0.8, 
    s2 = 0.9, NoTrait = FALSE, FilterByTrait = FALSE, SubsamplingNum = 1000, 
    CCcoef = c(10, 1, 10), trace = FALSE)
dim(W1)
## [1]  425 1000
abar <- getAbar(W1, P1 = p1, FeatureLabel = AbarLabel)
modules <- getMultiOmicsModules(abar, P1 = p1)

for (idx in 1:length(modules)) {
    plotMultiOmicsNetwork(abar, corr, modules, ModuleIdx = idx, 
        P1 = p1, FeatureLabel = AbarLabel, EdgeCut = 0)
}

## Double Check the Results This is to interpret the results.